# Install: pip install langchain langchain-community pypdf

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Rely on LangChain's ready-made components for loading and splitting.
pdf_path = "documents/雷展-java.pdf"

# Step 1: load the PDF with the stock loader.
loader = PyPDFLoader(pdf_path)
documents = loader.load()  # list of Document objects

print(f"加载了 {len(documents)} 页")

# Step 2: chunk the loaded documents with the stock text splitter.
# The separator list covers paragraph breaks, line breaks, the Chinese
# full stop, the ASCII period, spaces, and finally the empty string.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
    "separators": ["\n\n", "\n", "。", ".", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

text_chunks = text_splitter.split_documents(documents)
# Alternative when only a raw string is available:
#   text_splitter.split_text(text)

chunk_total = len(text_chunks)
print("Number of text chunks:", chunk_total)

# Preview: show at most the first three chunks, 100 characters of each.
preview = text_chunks[:3]
for i, chunk in enumerate(preview):
    print(f"Chunk {i+1}: {chunk.page_content[:100]}...")